The following dataset comes from a 2014 survey conducted by Open
Sourcing Mental Health (formerly OSMI), which aimed to measure tech
professionals’ attitudes towards mental health and the frequency of
mental health disorders within the field of technology.
# load in necessary libraries
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(ggcorrplot)
library(reshape2)
library(relaimpo)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
# Load the raw 2014 survey responses from disk and print a per-column summary.
# NOTE(review): the path is absolute and machine-specific; the script will only
# run where this exact file exists.
survey_path <- "/Users/kayleetringali/STAT442 Final/survey_2014.csv"
mental_health1 <- read.csv(file = survey_path)
summary(object = mental_health1)
## Timestamp Age Gender Country
## Length:1259 Min. :-1.726e+03 Length:1259 Length:1259
## Class :character 1st Qu.: 2.700e+01 Class :character Class :character
## Mode :character Median : 3.100e+01 Mode :character Mode :character
## Mean : 7.943e+07
## 3rd Qu.: 3.600e+01
## Max. : 1.000e+11
## state self_employed family_history treatment
## Length:1259 Length:1259 Length:1259 Length:1259
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## work_interfere no_employees remote_work tech_company
## Length:1259 Length:1259 Length:1259 Length:1259
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## benefits care_options wellness_program seek_help
## Length:1259 Length:1259 Length:1259 Length:1259
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## anonymity leave mental_health_consequence
## Length:1259 Length:1259 Length:1259
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## phys_health_consequence coworkers supervisor
## Length:1259 Length:1259 Length:1259
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## mental_health_interview phys_health_interview mental_vs_physical
## Length:1259 Length:1259 Length:1259
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## obs_consequence comments
## Length:1259 Length:1259
## Class :character Class :character
## Mode :character Mode :character
##
##
##
C: Condensing ‘Gender’ into three categories (Male, Female, Other) for
simplicity, then viewing the distribution
# Collapse the free-text 'Gender' responses into Male / Female / Other.
# Matching is case-insensitive and keyed on whole words: any response that
# contains one of the listed tokens as a standalone word is replaced in full.
# "mal" is a misspelling of "male", so it belongs in the male pattern; listing
# it in the female pattern would recode those respondents as "Female".
male_pattern <- "(?i)^(?=.*\\b(?:male|m|maile|malr|msle|make|mal)\\b).*$"
female_pattern <- "(?i)^(?=.*\\b(?:female|f|femake)\\b).*$"
mental_health2$Gender <- gsub(male_pattern, "Male", mental_health2$Gender, perl = TRUE)
mental_health2$Gender <- gsub(female_pattern, "Female", mental_health2$Gender, perl = TRUE)
# remove leading/trailing whitespace
mental_health2$Gender <- trimws(mental_health2$Gender)
# every remaining entry that is not exactly "Male" or "Female"
# (e.g. "Guy (-ish) ^_^") becomes "Other"
mental_health2$Gender[!(mental_health2$Gender %in% c("Male", "Female"))] <- "Other"

# count respondents per gender category and reshape into a two-column data frame
gender_counts <- table(mental_health2$Gender)
gender_counts_df <- as.data.frame(gender_counts)
names(gender_counts_df) <- c("Gender", "Count")

# donut chart (pie with a hole) of the three categories.
# The horizontal title position is a property of the title itself, not of the
# layout, so it is passed as title = list(text = ..., x = ...); a top-level
# `x` is rejected by plotly with an "objects don't have these attributes" warning.
gender_fig <- plot_ly(gender_counts_df, labels = ~Gender, values = ~Count, type = "pie", hole = 0.4) %>%
  layout(
    title = list(text = "Distribution of Gender Categories", x = 0.5),
    font = list(size = 13, color = "black", family = "Arial", weight = "bold")
  )
# show the plot
gender_fig
## Warning: 'layout' objects don't have these attributes: 'x'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
# Tabulate how many respondents reported each distinct Age value and store the
# result as a data frame with columns Age and Count.
age_counts <- table(mental_health2$Age)
age_counts_df <- setNames(as.data.frame(age_counts), c("Age", "Count"))

# Bar chart of the counts; bar colour runs from yellow (few) to red (many).
age_bar_theme <- theme(
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
  plot.title = element_text(hjust = 0.5, face = "bold")
)
ggplot(data = age_counts_df, mapping = aes(x = Age, y = Count, fill = Count)) +
  geom_col() +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(title = "Age Distribution", x = "Age", y = "Count") +
  age_bar_theme

# Overlaid, semi-transparent kernel density curves of Age, one per value of
# `treatment`, with fixed colours for the "Yes" and "No" groups.
treatment_fill <- c("Yes" = "seagreen", "No" = "purple")
ggplot(mental_health2, aes(x = Age, fill = treatment)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(name = "Treatment Status", values = treatment_fill) +
  labs(
    title = "Kernel Density Plot of Age by Treatment",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal() +
  # centre and embolden the title (applied after theme_minimal so it is kept)
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

# Count respondents per country among those whose `treatment` equals
# `treatment_value`, keep the ten largest counts and sort them descending.
# top_n() keeps ties, so head() caps the result at `n_top` rows.
top_countries_for <- function(survey, treatment_value, n_top = 10) {
  survey %>%
    filter(treatment == treatment_value) %>%
    count(Country, name = "count") %>%
    top_n(n_top, count) %>%
    arrange(desc(count)) %>%
    head(n_top)
}

# top 10 countries for respondents answering 'Yes' and 'No' to treatment
treated_countries <- top_countries_for(mental_health2, "Yes")
not_treated_countries <- top_countries_for(mental_health2, "No")

# stack both tables, tagging each row with the group it came from
all_countries <- rbind(
  transform(treated_countries, treatment_status = "Treating"),
  transform(not_treated_countries, treatment_status = "Not Treating")
)
# sort the combined rows by ascending count
all_countries <- all_countries[order(all_countries$count), ]

# horizontal dodged bar chart: one pair of bars per country
status_fill <- c("Treating" = "seagreen", "Not Treating" = "purple")
ggplot(all_countries, aes(x = reorder(Country, count), y = count, fill = treatment_status)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_fill_manual(name = "Treatment Status", values = status_fill) +
  labs(
    title = "Top 10 Countries - Treating vs. Not Treating Mental Health Issues",
    x = "Country",
    y = "Frequency"
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.title = element_text(face = NULL),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

# Side-by-side bar chart: number of respondents per answer to
# `mental_health_consequence`, split by `treatment`.
# The palette is unnamed, so colours are assigned to the fill levels in order.
custom_colors <- c("seagreen", "skyblue")
consequence_theme <- theme(
  legend.position = "top",
  legend.title = element_text(face = "bold"),
  plot.title = element_text(hjust = 0.5, face = "bold")
)
ggplot(
  data = mental_health2,
  mapping = aes(x = mental_health_consequence, fill = factor(treatment))
) +
  geom_bar(position = "dodge") +
  scale_fill_manual(name = "Treatment", values = custom_colors) +
  labs(
    title = "Frequency of Mental Health Consequence by Treatment",
    x = "Mental Health Consequence",
    y = "Frequency"
  ) +
  theme_minimal() +
  consequence_theme

# Encode survey answers as numbers so they can be fed to cor().
# Every encoded column is derived from mental_health3 itself: reading the
# source answers from a different data frame than the one being written to
# risks misaligned rows (or silent recycling) if the two frames differ.

# "No" -> 0, any other answer -> 1
encode_no_vs_rest <- function(x) {
  ifelse(x == "No", 0, 1)
}

# "No" -> 0, "Yes" -> 1, any other answer -> 2
# NOTE(review): the third code lumps all remaining answers together and is
# treated as an ordered value by cor() below; confirm that is intended.
encode_no_yes_other <- function(x) {
  ifelse(x == "No", 0, ifelse(x == "Yes", 1, 2))
}

# two-way encodings
mental_health3$family_history_num <- encode_no_vs_rest(mental_health3$family_history)
mental_health3$treatment_num <- encode_no_vs_rest(mental_health3$treatment)
mental_health3$self_employed_num <- encode_no_vs_rest(mental_health3$self_employed)
mental_health3$remote_work_num <- encode_no_vs_rest(mental_health3$remote_work)

# three-way encodings
mental_health3$benefits_num <- encode_no_yes_other(mental_health3$benefits)
mental_health3$wellness_programs_num <- encode_no_yes_other(mental_health3$wellness_program)
mental_health3$seek_help_num <- encode_no_yes_other(mental_health3$seek_help)
mental_health3$anonymity_num <- encode_no_yes_other(mental_health3$anonymity)
mental_health3$mental_health_consequence_num <- encode_no_yes_other(mental_health3$mental_health_consequence)
mental_health3$phys_health_consequence_num <- encode_no_yes_other(mental_health3$phys_health_consequence)

# select numerical columns for correlation analysis
numeric_columns <- c(
  "family_history_num", "treatment_num", "self_employed_num", "remote_work_num",
  "benefits_num", "wellness_programs_num", "seek_help_num", "anonymity_num",
  "mental_health_consequence_num", "phys_health_consequence_num"
)
numerical_data <- mental_health3[, numeric_columns]

# pairwise correlations between the encoded columns
correlation_matrix <- cor(numerical_data)

# lower-triangle correlation plot, variables ordered by hierarchical clustering
ggcorrplot(correlation_matrix, hc.order = TRUE,
           type = "lower", lab = TRUE, lab_size = 3,
           method = "circle", outline.color = "white",
           colors = c("blue", "white", "red"),
           title = "Correlation Heatmap of Numerical Variables")

# Full (square) correlation heatmap drawn with plain ggplot2.
# Recompute the correlations, then reshape the matrix to long format:
# one row per (Var1, Var2) pair with the coefficient in `value`.
correlation_matrix <- cor(numerical_data)
correlation_melted <- melt(correlation_matrix)

# diverging blue-white-red scale centred on zero and fixed to [-1, 1]
heatmap_fill <- scale_fill_gradient2(
  low = "blue", mid = "white", high = "red",
  midpoint = 0, limits = c(-1, 1), na.value = "grey50"
)

ggplot(correlation_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  # print each coefficient, rounded to two decimals, on its tile
  geom_text(aes(label = round(value, 2)), size = 3) +
  heatmap_fill +
  labs(title = "Correlation Heatmap of Numerical Variables") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1) # rotate x-axis labels
  )

# quantitative analysis
# convert categorical variables to factors
mental_health2$treatment <- as.factor(mental_health2$treatment)
mental_health2$mental_health_consequence <- as.factor(mental_health2$mental_health_consequence)

# Numeric codes are assigned by explicit lookup rather than as.numeric(factor):
# factor levels are sorted alphabetically by default, which would code the
# consequence variable as Maybe = 0, No = 1, Yes = 2 instead of the intended
# No = 0, Maybe = 1, Yes = 2. Answers outside the listed values become NA.
treatment_codes <- c("No", "Yes") # 'No' = 0, 'Yes' = 1
consequence_codes <- c("No", "Maybe", "Yes") # 'No' = 0, 'Maybe' = 1, 'Yes' = 2
mental_health2$treatment_numeric <-
  match(as.character(mental_health2$treatment), treatment_codes) - 1
mental_health2$mental_health_consequence_numeric <-
  match(as.character(mental_health2$mental_health_consequence), consequence_codes) - 1

# fit the linear regression model
# NOTE(review): the response is a 0/1 indicator; a logistic model
# (glm with family = binomial) may be more appropriate than lm.
# NOTE: any previously printed model summary must be regenerated after this
# recoding, since the predictor's coding has changed.
model <- lm(treatment_numeric ~ mental_health_consequence_numeric, data = mental_health2)
summary(model)
##
## Call:
## lm(formula = treatment_numeric ~ mental_health_consequence_numeric,
## data = mental_health2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5258 -0.5075 0.4743 0.4925 0.5107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.48930 0.02113 23.161 <2e-16 ***
## mental_health_consequence_numeric 0.01823 0.01845 0.988 0.323
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5002 on 1248 degrees of freedom
## Multiple R-squared: 0.0007815, Adjusted R-squared: -1.913e-05
## F-statistic: 0.9761 on 1 and 1248 DF, p-value: 0.3234